Load in required packages¶

In [1]:
import pandas as pd
from datetime import datetime, timedelta
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
In [2]:
# Import CSV with renamed columns
df = pd.read_csv("bike_data.csv")
df.info()
#Check the summary statistics of the data
df.describe()
# Clean up some columns
#day/month/year
df["date"] = pd.to_datetime(df["date"], format="%d/%m/%Y")
df.info()
#combine date column and hour column
df["datetime"] = df["date"] + pd.to_timedelta(df["hour"], unit="h")
df
#pd.set_option("future.no_silent_downcasting", True)
# Similar to is_holiday, map is_functioning to True and False
df["is_holiday"] = df["is_holiday"].replace({"No Holiday": False,
                                             "Holiday": True}).astype(bool)
df["is_functioning"] = df["is_functioning"].astype(bool)
df.info()
# Only keep observations where the system is functioning
df = df.query("is_functioning")
df.shape

# Print out the result
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   date                 8760 non-null   object 
 1   n_rented_bikes       8760 non-null   int64  
 2   hour                 8760 non-null   int64  
 3   temperature_celsius  8760 non-null   float64
 4   humidity_pct         8760 non-null   int64  
 5   wind_speed_mps       8760 non-null   float64
 6   visibility_10m       8760 non-null   int64  
 7   dew_point_temp_c     8760 non-null   float64
 8   solar_radiation      8760 non-null   float64
 9   rainfall_mm          8760 non-null   float64
 10  snowfall_cm          8760 non-null   float64
 11  season               8760 non-null   object 
 12  is_holiday           8760 non-null   object 
 13  is_functioning       8760 non-null   object 
dtypes: float64(6), int64(4), object(4)
memory usage: 958.3+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   date                 8760 non-null   datetime64[ns]
 1   n_rented_bikes       8760 non-null   int64         
 2   hour                 8760 non-null   int64         
 3   temperature_celsius  8760 non-null   float64       
 4   humidity_pct         8760 non-null   int64         
 5   wind_speed_mps       8760 non-null   float64       
 6   visibility_10m       8760 non-null   int64         
 7   dew_point_temp_c     8760 non-null   float64       
 8   solar_radiation      8760 non-null   float64       
 9   rainfall_mm          8760 non-null   float64       
 10  snowfall_cm          8760 non-null   float64       
 11  season               8760 non-null   object        
 12  is_holiday           8760 non-null   object        
 13  is_functioning       8760 non-null   object        
dtypes: datetime64[ns](1), float64(6), int64(4), object(3)
memory usage: 958.3+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   date                 8760 non-null   datetime64[ns]
 1   n_rented_bikes       8760 non-null   int64         
 2   hour                 8760 non-null   int64         
 3   temperature_celsius  8760 non-null   float64       
 4   humidity_pct         8760 non-null   int64         
 5   wind_speed_mps       8760 non-null   float64       
 6   visibility_10m       8760 non-null   int64         
 7   dew_point_temp_c     8760 non-null   float64       
 8   solar_radiation      8760 non-null   float64       
 9   rainfall_mm          8760 non-null   float64       
 10  snowfall_cm          8760 non-null   float64       
 11  season               8760 non-null   object        
 12  is_holiday           8760 non-null   bool          
 13  is_functioning       8760 non-null   bool          
 14  datetime             8760 non-null   datetime64[ns]
dtypes: bool(2), datetime64[ns](2), float64(6), int64(4), object(1)
memory usage: 906.9+ KB
Out[2]:
(8760, 15)

Visualize bike rentals over time¶

In [5]:
# Create a line plot of rented bikes over time
px.line(df,x="datetime",y="n_rented_bikes")
In [6]:
# Calculate the total number of rented bikes per day
df_day = df.groupby("date")["n_rented_bikes"].sum().reset_index()
by_day = df.groupby("date", as_index=False).agg({"n_rented_bikes": "sum"})
per_day = df.groupby("date")["n_rented_bikes"].sum()

# Create a line plot showing total number of bikes per day over time
In [7]:
# Copy the previous chain of manipulations and add season as a variable to group by
by_day_season = df.groupby(["date","season"], as_index=False).agg({"n_rented_bikes": "sum"})

# Copy the code for the previous line plot and map season to color
px.line(by_day_season, x="date", y="n_rented_bikes", color="season")

# Copy the code for the previous line plot and map season to color

Explore the relation between weather and rentals¶

In [18]:
#Query df to only keep observations at noon
noon_rides = df.query('hour == 12')

# Create a scatter plot showing temperature against number of rented bikes
# Create a subplot figure with 2 rows and 2 columns
fig = make_subplots(rows=2, cols=2, subplot_titles=("Temperature vs Rented Bikes (with Trendline)",
                                                    "Date vs Rented Bikes (colored by Temperature)",
                                                    "Temperature vs Rented Bikes (no Trendline)",
                                                    "Season vs Rented Bikes"
                                                   ))
#Graph1: Scatter plot with trendline
fig1 = px.scatter(noon_rides, x='temperature_celsius', y='n_rented_bikes', trendline='lowess')
fig.add_trace(fig1.data[0], row=1, col=1)
fig.add_trace(fig1.data[1], row=1, col=1)

#Graph2: Scatter plot with color-coded temperature
fig2 = px.scatter(df, x="datetime", y="n_rented_bikes", color="temperature_celsius")
fig.add_trace(fig2.data[0], row=1, col=2)

#Graph3:Simple scatter plot
fig3 = px.scatter(noon_rides, x="temperature_celsius", y="n_rented_bikes")
fig.add_trace(fig3.data[0], row=2, col=1)

#Graph4:Scatter plot with seasons
fig4 = px.scatter(df, x="season", y="n_rented_bikes")
fig.add_trace(fig4.data[0], row=2, col=2)

#Update layout
fig.update_layout(height=800, width=1000, title_text="Subplots of Rented Bikes Data", showlegend=True)
fig.show()

Explore typical daily usage pattern¶

In [22]:
# Calculate the average number of rented bikes per hour
time_of_day = df.groupby("hour", as_index=False).agg({"n_rented_bikes": "mean"})

# Create a bar chart showing the usage pattern
px.bar(time_of_day, x="hour", y = "n_rented_bikes")
In [23]:
# Copy and adapt the previous query to take into account the season
time_of_day_season = df.groupby(["hour", "season"], as_index=False).agg({"n_rented_bikes": "mean"})

# Copy and adapt the code for the previous bar chart to show usage pattern per season
px.bar(time_of_day_season, x="hour", y="n_rented_bikes", color="season", facet_col="season")

Extra: is New Year's Eve different?¶

In [24]:
# New Years dates
new_years_start = datetime(2017,12,31,12)
new_years_end = datetime(2018,1,1,12)

new_year = df.query("@new_years_start <= datetime <= @new_years_end") 
px.bar(new_year,x = 'datetime', y = 'n_rented_bikes')
In [ ]: